To complement the fixed window analysis focused on emergence and elimination events, we also want to show that the dynamics of EWS reflect those of \(R_E\). I estimated \(R_E\) over the observations via particle filtering and state estimation. Here, I read in those values for each city and correlate them with top-performing EWS. Top-performing EWS are those with high AUC in the emergence and elimination scenarios. Note that \(R_E\) was calculated as:

\[ R_E = \frac{\beta_t}{\gamma} \frac{S_t}{N_t}. \]

library(tidyverse)
cities <- c("Agadez", "Maradi", "Niamey", "Zinder")  # the four city names
reffs <- tibble()  # empty tibble for storage
for(do_city in cities){
  fname <- paste0("../../results/filtered-states-", do_city, ".RDS")
  raw_filter <- readRDS(fname)
  reff_id <- which(raw_filter$state == "effective_r_seasonal")
  tmp_reff <- raw_filter$data[[reff_id]] %>%
    dplyr::select(time, med, week, date) %>%
    dplyr::mutate(city = do_city)
  
  reffs <- bind_rows(reffs, tmp_reff)
}
ggplot(reffs, aes(x = date, y = med, color = city)) +
  geom_hline(aes(yintercept = 1), linetype = 2, color = "grey35") +
  geom_line() +
  labs(x = "date", y = expression(italic(R)[E]), 
       title = "Effective reproduction number over time") +
  ggthemes::scale_color_colorblind()

Now I calculate the EWS at each observation time using the spaero::get_stats() function. I use a bandwidth of 30 weeks and set backward_only = TRUE so that only past data are used when calculating EWS.

library(spaero)
fname <- "../../data/clean-data/weekly-measles-incidence-niger-cities-clean.RDS"
measles_data <- readRDS(fname) %>%
  filter(year > 1994)  # drop first NA year, only used for modeling
all_stats <- tibble()  # empty tibble for storage
for(do_region in unique(measles_data$region)){
  
  cases <- measles_data %>%
    filter(region == do_region) %>%
    pull(cases)
  
  city_stats <- spaero::get_stats(
    x = cases,
    center_trend = "local_constant", 
    center_kernel = "uniform", 
    center_bandwidth = 30, 
    stat_trend = "local_constant", 
    stat_kernel = "uniform", 
    stat_bandwidth = 30, 
    lag = 1, 
    backward_only = TRUE
  )$stats
  
  city_stats_tb <- as_tibble(city_stats) %>%
    mutate(
      time_iter = 1:n(),
      date = unique(measles_data$date)
    ) %>%
    gather(key = ews, value = value, -time_iter, -date) %>%
    mutate(region = do_region)
  
  all_stats <- bind_rows(all_stats, city_stats_tb)
}
# best_ews <- c("autocorrelation", "autocovariance", "mean", "variance")
# all_stats <- all_stats %>%
#   dplyr::filter(ews %in% best_ews)

Now I look at scatterplots of \(R_E\) versus EWS.

reffs <- reffs %>%
  dplyr::mutate(region = paste0(city, " (City)")) %>%
  dplyr::select(-city)
ews_reffs <- left_join(all_stats, reffs)
Joining, by = c("date", "region")
ews_reffs_subcrit <- ews_reffs %>%
  filter(med < 1)
ggplot(ews_reffs_subcrit, aes(x = med, y = value, color = region)) +
  geom_point(alpha = 0.2) +
  facet_wrap(~ews, scales = "free") +
  ggthemes::scale_color_colorblind()

# for(do_region in unique(ews_reffs$region)){
#   print(ggplot(filter(ews_reffs, region == do_region & ews == "variance"), 
#        aes(x = med, y = value, color = lubridate::week(date))) +
#     geom_point() +
#     facet_wrap(~lubridate::year(date), scales = "free") +
#     labs(x = expression(R[E]), y = "EWS value", title = do_region) +
#     scale_color_viridis_c(name = "week of year", direction = 1))
# }

No clear signal in those plots. Next I’ll look at the first difference of each EWS (x) between time t and \(t+1\). I calculate the difference in three ways:

  1. Simple difference: \(x(t) - x(t-1)\)
  2. Ratio: \(\frac{x(t)}{x(t-1)}\)
  3. Log ratio: \(\text{log}\left(\frac{x(t)}{x(t-1)}\right)\)
ews_reffs_diffed <- ews_reffs %>%
  group_by(ews, region) %>%
  mutate(
    diff_value = value - lag(value, 1),
    ratio_value = value / lag(value, 1),
    log_ratio_value = log(ratio_value)
  ) %>%
  rename("Reff" = med)
NaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs producedNaNs produced

Correlations between \(R_E\) and first differenced EWS

ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  ggplot(aes(x = Reff, y = round(diff_value,1))) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "First difference of EWS", x = expression(R[E])) +
  theme(legend.position = "top")

Correlations between \(R_E\) and “first ratioed” EWS

ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  ggplot(aes(x = Reff, y = round(ratio_value,1))) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "First ratio of EWS", x = expression(R[E])) +
  theme(legend.position = "top")

Correlations between \(R_E\) and log of “first ratioed” EWS

ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  ggplot(aes(x = Reff, y = log_ratio_value)) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "log(First ratio of EWS)", x = expression(R[E])) +
  theme(legend.position = "top")

Correlations between \(R_E\) and log of “first ratioed” EWS, near-zeros removed

It looks like there might be trends when the EWS are actually “moving”, that is, when \(\text{log}\left(\frac{x(t)}{x(t-1)} \right) \ne 0\) or approximately so. So, I’ll remove those values that are close to zero by assuming all values less than \(|0.05|\) are essentially zero. Now some patterns emerge as expected.

ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  filter(round(log_ratio_value,1) != 0) %>%
  ggplot(aes(x = Reff, y = log_ratio_value)) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  stat_smooth(method = "lm", se = FALSE, linetype = 1, color = "black") +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "log(First ratio of EWS)", x = expression(R[E])) +
  theme(legend.position = "top")

Look at lag in dynamics

Due to the stochastic seasonal dynamics, there may be a lag between \(R_E\) and EWS that are calculated from raw case data. To look at this, I will plot \(R_E\) and each EWS on the same plot, with peaks in each year highlighted. These plots will come from simulated series where we know that the \(R_E\) and the dynamics are explicitly linked (unlike \(R_E\) estimates from particle filtering with the real data).

all_sim_files <- list.files("../../simulations/")
sim_ids <- grep("bootstrap*", all_sim_files)
sim_files <- all_sim_files[sim_ids]
all_sims <- tibble()  # empty tibble for storage
for(do_file in sim_files){
  tmp_sim <- readRDS(paste0("../../simulations/", do_file)) %>%
    filter(sim == 1) %>%
    unnest() %>%
    mutate(city = str_sub(do_file, 16, -5))
  
  all_sims <- bind_rows(all_sims, tmp_sim)
}
sim_ews <- tibble()  # empty storage df
for(do_city in unique(all_sims$city)){
  tmp_data <- all_sims %>%
    filter(city == do_city)
  
  tmp_cases <- tmp_data$reports
  
  tmp_stats <- spaero::get_stats(
    x = tmp_cases,
    center_trend = "local_constant", 
    center_kernel = "uniform", 
    center_bandwidth = 30, 
    stat_trend = "local_constant", 
    stat_kernel = "uniform", 
    stat_bandwidth = 30, 
    lag = 1, 
    backward_only = TRUE
  )$stats
  
  sim_ews <- sim_ews %>%
    bind_rows(
      tmp_data %>% bind_cols(as_tibble(tmp_stats))
    )
}
sim_ews <- sim_ews %>%
  mutate(critical = ifelse(RE_seas < 1, FALSE, TRUE))
for(do_city in unique(sim_ews$city)){
  tmp <- filter(sim_ews, city == do_city)
  x <- tmp$time
  y <- tmp$RE_seas
  y2 <- log(tmp$variance)
  color <- ifelse(tmp$critical == TRUE, "red", "blue")
  
  par(mar = c(5,5,2,5))
  plot(x, y, type = "n", ylab = expression(italic(R)[E]), xlab = "Date", main = do_city)
  segments(x0 = x, y0 = y, x1 = x+diff(x), y1 = y+diff(y), col = color)
  par(new = T)
  plot(x, y2, type = "l", axes=F, xlab=NA, ylab=NA, lwd = 1.5)
  axis(side = 4)
  mtext(side = 4, line = 3, "Log variance")
}

longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length

longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length

longer object length is not a multiple of shorter object lengthlonger object length is not a multiple of shorter object length

---
title: "Correlations between EWS and effective reproduction number"
output: html_notebook
date: 2019-01-22
---

To complement the fixed window analysis focused on emergence and elimination events, we also want to show that the dynamics of EWS reflect those of $R_E$.
I estimated $R_E$ over the observations via particle filtering and state estimation.
Here, I read in those values for each city and correlate them with top-performing EWS.
Top-performing EWS are those with high AUC in the emergence and elimination scenarios.
Note that $R_E$ was calculated as:

$$
R_E = \frac{\beta_t}{\gamma} \frac{S_t}{N_t}.
$$

```{r load-re}
library(tidyverse)

cities <- c("Agadez", "Maradi", "Niamey", "Zinder")  # the four city names
reffs <- tibble()  # empty tibble for storage

for(do_city in cities){
  fname <- paste0("../../results/filtered-states-", do_city, ".RDS")
  raw_filter <- readRDS(fname)
  reff_id <- which(raw_filter$state == "effective_r_seasonal")
  tmp_reff <- raw_filter$data[[reff_id]] %>%
    dplyr::select(time, med, week, date) %>%
    dplyr::mutate(city = do_city)
  
  reffs <- bind_rows(reffs, tmp_reff)
}

ggplot(reffs, aes(x = date, y = med, color = city)) +
  geom_hline(aes(yintercept = 1), linetype = 2, color = "grey35") +
  geom_line() +
  labs(x = "date", y = expression(italic(R)[E]), 
       title = "Effective reproduction number over time") +
  ggthemes::scale_color_colorblind()
```

Now I calculate the EWS at each observation time using the `spaero::get_stats()` function.
I use a bandwidth of 30 weeks and set `backward_only = TRUE` so that only past data are used when calculating EWS.

```{r ews-calc}
library(spaero)

fname <- "../../data/clean-data/weekly-measles-incidence-niger-cities-clean.RDS"
measles_data <- readRDS(fname) %>%
  filter(year > 1994)  # drop first NA year, only used for modeling

all_stats <- tibble()  # empty tibble for storage

for(do_region in unique(measles_data$region)){
  
  cases <- measles_data %>%
    filter(region == do_region) %>%
    pull(cases)
  
  city_stats <- spaero::get_stats(
    x = cases,
    center_trend = "local_constant", 
    center_kernel = "uniform", 
    center_bandwidth = 30, 
    stat_trend = "local_constant", 
    stat_kernel = "uniform", 
    stat_bandwidth = 30, 
    lag = 1, 
    backward_only = TRUE
  )$stats
  
  city_stats_tb <- as_tibble(city_stats) %>%
    mutate(
      time_iter = 1:n(),
      date = unique(measles_data$date)
    ) %>%
    gather(key = ews, value = value, -time_iter, -date) %>%
    mutate(region = do_region)
  
  all_stats <- bind_rows(all_stats, city_stats_tb)
}

# best_ews <- c("autocorrelation", "autocovariance", "mean", "variance")
# all_stats <- all_stats %>%
#   dplyr::filter(ews %in% best_ews)
```

Now I look at scatterplots of $R_E$ versus EWS.

```{r scatters}
reffs <- reffs %>%
  dplyr::mutate(region = paste0(city, " (City)")) %>%
  dplyr::select(-city)

ews_reffs <- left_join(all_stats, reffs)
ews_reffs_subcrit <- ews_reffs %>%
  filter(med < 1)

ggplot(ews_reffs_subcrit, aes(x = med, y = value, color = region)) +
  geom_point(alpha = 0.2) +
  facet_wrap(~ews, scales = "free") +
  ggthemes::scale_color_colorblind()

# for(do_region in unique(ews_reffs$region)){
#   print(ggplot(filter(ews_reffs, region == do_region & ews == "variance"), 
#        aes(x = med, y = value, color = lubridate::week(date))) +
#     geom_point() +
#     facet_wrap(~lubridate::year(date), scales = "free") +
#     labs(x = expression(R[E]), y = "EWS value", title = do_region) +
#     scale_color_viridis_c(name = "week of year", direction = 1))
# }
```

No clear signal in those plots.
Next I'll look at the first difference of each EWS (*x*) between time *t* and $t+1$.
I calculate the difference in three ways:

1. Simple difference: $x(t) - x(t-1)$
2. Ratio: $\frac{x(t)}{x(t-1)}$
3. Log ratio: $\text{log}\left(\frac{x(t)}{x(t-1)}\right)$

```{r calc-ews-diffs}
ews_reffs_diffed <- ews_reffs %>%
  group_by(ews, region) %>%
  mutate(
    diff_value = value - lag(value, 1),
    ratio_value = value / lag(value, 1),
    log_ratio_value = log(ratio_value)
  ) %>%
  rename("Reff" = med)
```

### Correlations between $R_E$ and first differenced EWS

```{r first-diff-scatters, fig.height=20, fig.width=8}
ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  ggplot(aes(x = Reff, y = round(diff_value,1))) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "First difference of EWS", x = expression(R[E])) +
  theme(legend.position = "top")
```

### Correlations between $R_E$ and "first ratioed" EWS

```{r fig.height=20, fig.width=8}
ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  ggplot(aes(x = Reff, y = round(ratio_value,1))) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "First ratio of EWS", x = expression(R[E])) +
  theme(legend.position = "top")
```

### Correlations between $R_E$ and log of "first ratioed" EWS

```{r fig.height=20, fig.width=8}
ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  ggplot(aes(x = Reff, y = log_ratio_value)) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "log(First ratio of EWS)", x = expression(R[E])) +
  theme(legend.position = "top")
```

### Correlations between $R_E$ and log of "first ratioed" EWS, near-zeros removed

It looks like there might be trends when the EWS are actually "moving", that is, when $\text{log}\left(\frac{x(t)}{x(t-1)} \right) \ne 0$ or approximately so.
So, I'll remove those values that are close to zero by assuming all values less than $|0.05|$ are essentially zero.
Now some patterns emerge as expected.

```{r fig.height=20, fig.width=8}
ews_reffs_diffed %>%
  filter(Reff < 1) %>%
  filter(round(log_ratio_value,1) != 0) %>%
  ggplot(aes(x = Reff, y = log_ratio_value)) +
  geom_point(aes(color = lubridate::week(date)), alpha = 0.7) +
  stat_smooth(method = "lm", se = FALSE, linetype = 1, color = "black") +
  facet_wrap(ews~region, scales = "free", ncol = 4) +
  scale_color_viridis_c(name = "Week of year", limits = c(1,52), 
                        breaks = c(1,26,52)) +
  labs(y = "log(First ratio of EWS)", x = expression(R[E])) +
  theme(legend.position = "top")
```

# Look at lag in dynamics
Due to the stochastic seasonal dynamics, there may be a lag between $R_E$ and EWS that are calculated from raw case data.
To look at this, I will plot $R_E$ and each EWS on the same plot, with peaks in each year highlighted.
These plots will come from simulated series where we know that the $R_E$ and the dynamics are explicitly linked (unlike $R_E$ estimates from particle filtering with the real data).

```{r load-sims}
all_sim_files <- list.files("../../simulations/")
sim_ids <- grep("bootstrap*", all_sim_files)
sim_files <- all_sim_files[sim_ids]

all_sims <- tibble()  # empty tibble for storage
for(do_file in sim_files){
  tmp_sim <- readRDS(paste0("../../simulations/", do_file)) %>%
    filter(sim == 1) %>%
    unnest() %>%
    mutate(city = str_sub(do_file, 16, -5))
  
  all_sims <- bind_rows(all_sims, tmp_sim)
}
```

```{r calc-sim-ews}
sim_ews <- tibble()  # empty storage df
for(do_city in unique(all_sims$city)){
  tmp_data <- all_sims %>%
    filter(city == do_city)
  
  tmp_cases <- tmp_data$reports
  
  tmp_stats <- spaero::get_stats(
    x = tmp_cases,
    center_trend = "local_constant", 
    center_kernel = "uniform", 
    center_bandwidth = 30, 
    stat_trend = "local_constant", 
    stat_kernel = "uniform", 
    stat_bandwidth = 30, 
    lag = 1, 
    backward_only = TRUE
  )$stats
  
  sim_ews <- sim_ews %>%
    bind_rows(
      tmp_data %>% bind_cols(as_tibble(tmp_stats))
    )
}
```

```{r plot-dynamics, fig.height=4, fig.width=10}
sim_ews <- sim_ews %>%
  mutate(critical = ifelse(RE_seas < 1, FALSE, TRUE))

for(do_city in unique(sim_ews$city)){
  tmp <- filter(sim_ews, city == do_city)
  x <- tmp$time
  y <- tmp$RE_seas
  y2 <- log(tmp$variance)
  color <- ifelse(tmp$critical == TRUE, "red", "blue")
  
  par(mar = c(5,5,2,5))
  plot(x, y, type = "n", ylab = expression(italic(R)[E]), xlab = "Date", main = do_city)
  segments(x0 = x, y0 = y, x1 = x+diff(x), y1 = y+diff(y), col = color)
  par(new = T)
  plot(x, y2, type = "l", axes=F, xlab=NA, ylab=NA, lwd = 1.5)
  axis(side = 4)
  mtext(side = 4, line = 3, "Log variance")
}


```